# Alexander F. Gazmararian
# agazmararian@gmail.com

library(here)
library(tidyverse)
library(tidylog)
library(tigris)
library(sf)
library(glue)

source(here("R", "visibility", "replication_mode.R"))

# Replication mode detection ----
# This script CREATES the geo file, so we can't use standard mode detection
# Instead, check if the required INPUT (geocoding cache) exists
geocoding_cache <- here("data", "cache", "geocoding", "zip_code_geoloc.csv")
survey_input <- here("data", "cache", "survey_visibility_processed.rds")

# Check if we have the inputs needed to run geo processing
can_run_geo <- file.exists(geocoding_cache) && file.exists(survey_input)

if (!can_run_geo) {
  message("=== ANONYMIZED MODE ===")
  message("Skipping geo-identifier processing (geocoding cache not available)")
  message("Merge script will use non-geo survey data with cached distance calculations")
} else {
  message("=== FULL MODE ===")
  message("Running geo-identifier processing (geocoding cache available)")
  
# Load survey data
g <- readRDS(here("data", "cache", "survey_visibility_processed.rds"))

# Add longitude/latitude coordinates based on surveyed ZIP codes
zips <- read_csv(here("data", "cache", "geocoding", "zip_code_geoloc.csv"), show_col_types = FALSE)
zips <- zips %>%
  group_by(zip) %>%
  summarize(
    lon = mean(lon),
    lat = mean(lat)
  ) %>%
  rename(
    lon_zip = lon,
    lat_zip = lat
  )
g <- left_join(g, zips, by = c("zip_survey" = "zip"))

# Impute missing ZIP codes from survey with IP values
message(glue("Number of ZIP codes with missing coordinates: {sum(is.na(g$lon_zip))}"))
message(glue("Number of ZIP codes with missing coordinates: {sum(is.na(g$lat_zip))}"))
g <- g %>%
  mutate(
    lon_zip = coalesce(lon_zip, lon_ip),
    lat_zip = coalesce(lat_zip, lat_ip)
  )

# Create indicator for anomalous locations based on mismatch between
# IP address indicated coordinates and ZIP code based coordinates
resid.lon <- lm(lon_ip ~ lon_zip, g) %>%
  resid()
g$lon_anom <- as.integer(resid.lon > sd(resid.lon) * 2)
resid.lat <- lm(lat_ip ~ lat_zip, g) %>%
  resid()
g$lat_anom <- as.integer(resid.lat > sd(resid.lat) * 2)
g$loc_anom <- as.integer(g$lon_anom == 1 | g$lat_anom == 1)

# Convert longitude/latitude to spatial object
g <- st_as_sf(g, coords = c("lon_zip", "lat_zip"), crs = 4326, remove = FALSE)
# Use North America Albers Equal Area Conic
# it preserves area, which is important for comparing quantities that depend on area like distance
g <- st_transform(g, crs = "ESRI:102003")

# Get county Census Bureau FIPS identifiers for merging
counties <- counties(cb = TRUE, year = 2023)
counties <- st_transform(counties, st_crs(g))
names(counties) <- tolower(names(counties))
counties <- counties %>%
  mutate(fips = as.numeric(paste0(statefp, countyfp))) %>%
  rename(county_name = name) %>%
  dplyr::select(fips, state_name, county_name)
counties <- counties %>%
  rename(
    state = state_name,
    county = county_name
  )
g <- st_join(g, counties, left = TRUE)

# Get county FIPS prior to the reorganization of some
counties.pre <- counties(cb = TRUE, year = 2018)
counties.pre <- st_transform(counties.pre, st_crs(g))
names(counties.pre) <- tolower(names(counties.pre))
counties.pre <- counties.pre %>%
  mutate(fips.pre = as.numeric(paste0(statefp, countyfp))) %>%
  dplyr::select(fips.pre)
g <- st_join(g, counties.pre, left = TRUE)

# Create identifier for BEA FIPS
gdp <- suppressWarnings(read_csv(here("data", "input", "bea", "CAGDP9", "CAGDP9__ALL_AREAS_2001_2023.csv"), show_col_types = FALSE))
gdp <- gdp[1:(nrow(gdp) - 4), ]
gdp <- gdp %>%
  rename(fips = GeoFIPS) %>%
  mutate(fips = trimws(fips)) %>%
  filter(!substr(fips, 3, 5) == "000")
bea.fips <- subset(gdp, select = c(fips, GeoName, Region))
bea.fips <- distinct(bea.fips)
bea.fips$GeoName <- iconv(bea.fips$GeoName, from = "latin1", to = "UTF-8")
bea.fips <- filter(bea.fips, grepl("\\+", GeoName))
bea.fips$GeoName <- gsub("\\*", "", bea.fips$GeoName)
bea.fips$stateabb <- str_sub(bea.fips$GeoName, -2, -1)
bea.fips$state <- state.name[match(bea.fips$stateabb, state.abb)]
bea.fips <- bea.fips %>%
  separate_rows(GeoName, sep = "\\+|,") %>%
  mutate(GeoName = str_trim(GeoName)) %>%
  rename(
    fips.bea = fips,
    county = GeoName
  ) %>%
  dplyr::select(fips.bea, county, state)

g <- left_join(g, bea.fips, by =  c("county", "state"))
g$fips.bea <- ifelse(is.na(g$fips.bea), g$fips.pre, g$fips.bea)
g$fips.bea <- as.numeric(g$fips.bea)

g <- g %>%
  dplyr::select(-geometry, geometry)

saveRDS(g, file = here("data", "inter", "survey_visibility_processed_with_geo.rds"))
message(glue("[OK] Survey data with geocoded ZIP codes saved to: {here('data', 'inter', 'survey_visibility_processed_with_geo.rds')}"))

# Save FIPS assignments to cache for anonymized replication
# These are county-level identifiers (not precise locations) and are needed
# for joining with county-level covariates (BEA, broadband, elections, etc.)
# Also includes loc_anom (0/1 flag for geo_subset robustness analysis)
fips_cache <- g %>%
  st_drop_geometry() %>%
  select(response_id, fips, fips.pre, fips.bea, county, state, loc_anom) %>%
  distinct()
write_csv(fips_cache, get_fips_cache_path())
message(glue("[OK] FIPS cache saved to: {get_fips_cache_path()}"))
message(glue("    Contains {nrow(fips_cache)} respondent FIPS assignments"))
message(glue("    Includes loc_anom for geo_subset robustness analysis"))

} # End of REPLICATION_MODE == "full" block